########################################################
# Project:    Commission Communication
# Task:       Create balanced, topics-macthed sample of
#             executive PRs and compare understandability
# Author:     Christian Rauh (01.02.2021)
########################################################

# Intuition
# Worse clarity might hinge on the fact that Comm talks about inherently 
# more 'complex' topics, i.e. topic distribution 'confounds' DV
# I thus engage in creating a balanced sample along the topics in COMM and UK/IRE PRS
# Following this work: https://onlinelibrary.wiley.com/doi/full/10.1111/ajps.12526?casa_token=QGbL_R2026UAAAAA%3AFGsvDIMFendg79CHd8o8_LRbLkxz8DoOp1z8tP3TwM7NV9ZBM3GcbewoSYo-zGiu9CsnZ34CWLCcYBg

# Packages ####
library(tidyverse) # 1.3.0
library(Hmisc) # 4.4-1
library(cowplot) # 1.1.0
library(textcat) #1.0-7
library(spacyr) # 1.2.1
library(quanteda) # 3.2.0
library(stm) # 1.3.6

# Textmatching package 
# No longer on CRAN because of a dependency issue, dl and install archived version first
# url <- "https://cran.r-project.org/src/contrib/Archive/textmatching/textmatching_0.1.0.tar.gz"
# pkgFile <- "./Tools/textmatching_0.1.0.tar.gz"
# download.file(url = url, destfile = pkgFile)
# install.packages(c("stm", "cem")) # Dependencies
# install.packages(pkgs=pkgFile, type="source", repos=NULL)
library(textmatching)


# # Raw corpus ####
# # Select PRs from a similar time period (for domestic executives, shorter time frame available only)
# 
# # Individual corpora
# # Varible construction follows exactly the approach for calculating the clarity indicators
# # See PRs-TextIndicators.R
# 
# com <- read_rds("./Corpora/EC-PressReleases_1985-2020_clean.RDS") %>%
#   mutate(sender = "Commission") %>%
#   select(c(sender, date, month, text))
# com$cid <- 1:nrow(com) # Identical id as in extraction of text indicators
# 
# 
# uk <- read_rds("./Corpora/UK-GovPressReleases.Rds")%>%
#   filter(speech == F,
#          str_count(text, "\\. ") > 2, # faulty scrapes do not contain multiple sentences
#          str_detect(text, "(T|t)he "),
#          !str_detect(text, "PDF, [0-9]{1,4}KB"), # Mostly reference to files only
#          !str_detect(text, "HTML Details"), # Same
#          !str_detect(text, "Kick off time")) %>% # Lists of football travel advice
#   mutate(text = str_remove(paste0(headline, ". ", text), " Curriculum Vitae .*?$")) %>%
#   mutate(sender = "UK") %>%
#   mutate(month = str_remove(as.character(date), "-[0-9]{2}$")) %>%
#   select(c(sender, date, month, text))
# uk$cid <- 1:nrow(uk) # Identical id as in extraction of text indicators
# 
# 
# ire <- read_rds("./Corpora/IRE-GovPressReleases.Rds")%>%
#   mutate(text = paste0(headline, ". ", text) %>%
#            str_remove(fixed("Share . Email . Facebook . Twitter")) %>%
#            str_remove_all(fixed("•"))) %>%
#   mutate(sender = "IRE") %>%
#   mutate(month = str_remove(as.character(date), "-[0-9]{2}$")) %>%
#   select(c(sender, date, month, text))
# ire$cid <- 1:nrow(ire) # Identical id as in extraction of text indicators
# 
# # Filter non-english sentences
# # Irish politicians often use Irish bits which would offset the grammar and lang indicators
# 
# for (i in 1:nrow(ire)){
# 
#   # Progress
#   print(round((i/nrow(ire))*100, 2))
# 
#   # Sentence tokenizer
#   df <- spacy_tokenize(ire$text[i], what = "sentence") %>%
#     data.frame() %>%
#     rename(sentences = 1)
# 
#   # Language detection
#   df$lang <- textcat(df$sentences)
# 
#   # Drop non-english sentences
#   df <- df %>% filter(lang == "english")
# 
#   # Rebuild and store text
#   ire$text[i] <- paste(df$sentences, collapse = " ")
# }
# 
# 
# # Combine corpora and inspect temporal overlap
# corp <- rbind(com, uk, ire)
# 
# # PRs per sender over time
# ggplot(corp, aes(x = month)) +
#   geom_bar(aes(y = (..count..))) +
#   scale_x_discrete(breaks = paste0(seq(1985, 2021, 1), "-01"))+
#   facet_wrap(.~sender, ncol = 1, scales = "free_y") +
#   theme(axis.text.x = element_text(angle = 90, vjust = .5))
# 
# ggplot(corp[corp$month > "2006-01", ], aes(x = month)) +
#   geom_bar(aes(y = (..count..))) +
#   scale_x_discrete(breaks = paste0(seq(2006, 2021, 1), "-01"))+
#   facet_wrap(.~sender, ncol = 1, scales = "free_y") +
#   theme(axis.text.x = element_text(angle = 90, vjust = .5))
# 
# # So UK consisitently only avaialble from early 2010, IR from 2018 only
# # Both periods in which COMM PR output was declining
# # I choose to cut-off with the beginning of 2010, which suggest an overrepresentation
# # of the UK and an underreprensentation of IRE
# # that shouldn't matter as we match on Com vs both nat executives anyway ...
# 
# # Combined 'raw' corpus
# 
# raw <- corp %>%
#   filter(month >= "2010-01") %>%
#   mutate(commission = ifelse(sender == "Commission", T, F))
# 
# ggplot(raw, aes(x = sender))+
#   geom_bar(aes(y = ..count..))
# 
# # Clean up
# # rm(list = c("com", "ire", "uk"))
# 
# 
# # Merge understandability indicators
# 
# # Commission
# com <- raw %>%  filter(sender == "Commission") %>% rename(cid = id)
# comdf <- read_rds("./data/PR-Comm_Language.Rds") %>%
#   select(c(meanSentenceLength, n_verb, n_noun, ntoken, flesch, familiarity)) %>%
#   mutate(cid = row_number())
# com <- merge(com, comdf, by = "cid", all.x = T)
# 
# # UK
# uk <- raw %>%  filter(sender == "UK") %>% rename(cid = id)
# ukdf <- read_rds("./data/PR-UK_Language.Rds") %>%
#   select(c(meanSentenceLength, n_verb, n_noun, ntoken, flesch, familiarity)) %>%
#   mutate(cid = row_number())
# uk <- merge(uk, ukdf, by = "cid", all.x = T)
# 
# # IRE
# ire <- raw %>%  filter(sender == "IRE") %>% rename(cid = id)
# iredf <- read_rds("./data/PR-IRE_Language.Rds") %>%
#   select(c(meanSentenceLength, n_verb, n_noun, ntoken, flesch, familiarity)) %>%
#   mutate(cid = row_number())
# ire <- merge(ire, iredf, by = "cid", all.x = T)
# 
# 
# # Re-combine corpus
# raw <- rbind(com, uk, ire)
# 
# # Clean up
# # rm(list = c("com", "ire", "uk", "comdf", "ukdf", "iredf"))
# 
# # Filter corpus
# # As in generating the main descriptive plot (PRs_DescriptivePlot.R)
# raw <- raw %>%
#   filter(meanSentenceLength > 3, # Average sentence length below three tokens is implausible
#          meanSentenceLength < 150) %>% # Average sentence above 150 tokens is implausible
#   filter(n_noun > 1 & n_verb > 1) %>% # At least one verb and one noun
#   filter(ntoken >= 5) %>%  # At least 5 words
#   filter(text != "")
# 
# 
# # Date filter (above string filter didn't work in dyplr context)
# 
# raw$date2 <- as.Date(raw$date)
# min(raw$date2)
# max(raw$date2)
# raw <- raw %>%  filter(date2 >= as.Date("2010-01-01"))
# 
# 
# # Export
# write_rds(raw, "./Corpora/Overlap-EC-UK-IRE.rds")


# Reload save object 
raw <- read_rds("./Corpora/Overlap-EC-UK-IRE.rds")
table(raw$sender)
min(raw$date)
max(raw$date)

# # Diverging word usage ####
# 
# # Relative word frequencies
# corp <- corpus(raw$text, docvars = raw[,c("sender", "commission")])
# mat <- dfm(corp, tolower = T, stem = F, split_hyphens = T,
#            remove_punct = T, remove_numbers = T, remove_symbols = T, remove = stopwords("english"),
#            verbose = T)
# wmat <- dfm_weight(mat, scheme = "prop")
# 
# # Get differences in word usage between Comm and nat executives
# # DF of differences in word usages
# typefreq <- dfm_group(mat, groups = docvars(wmat)$commission) %>%    # Group by Commission as sender (see above)
#   dfm_weight(scheme = "prop") %>% 
#   t() %>%                                                             # Transpose matrix
#   convert(to = "data.frame") %>% 
#   rename(word = doc_id,                                               # Transposition nuissance as first column was doc_id before
#          nat = `FALSE`,
#          com = `TRUE`) %>%                                  
#   mutate(diff = com - nat,                                            # Difference in relative term freqs between Hitler and BT
#          uniqc = ifelse(nat == 0 & com != 0, T, F),                   # Word only used by Com, but not by nat executives
#          uniqn = ifelse(nat != 0 & com == 0, T, F)) %>%               # Word only used by nat executives, but not by Com
#   arrange(desc(diff))
# 
# # Unique words in groups
# uniqcom <- typefreq %>% filter(uniqc) %>% arrange(desc(com)) %>% rename(relfreq = com) %>% 
#   mutate(label = "Freq. used by the Commission\nbut never by nat. governments\nTop-30") %>% select(word, relfreq, label)
# uniqnat <- typefreq %>% filter(uniqn) %>% arrange(desc(nat)) %>% rename(relfreq = nat) %>% 
#   mutate(label = "Freq. used by nat. govs.,\nbut never by the Commission\nTop-30") %>% select(word, relfreq, label)
# 
# topuniqwords <- rbind(head(uniqcom, 30),
#                       head(uniqnat, 30))
# 
# ggplot(topuniqwords, aes(x=relfreq, y = reorder(word, relfreq)))+
#   geom_col()+
#   facet_wrap(.~label, scales = "free") +
#   labs(title = "Unique Words in Commission and national press releases",
#        x= "Relative word frequencies in texts of respective groups",
#        y= "")+
#   theme_bw()+
#   theme(strip.text = element_text(face = "bold"),
#         axis.text.y = element_text(color = "black"))
# 
# ggsave("./Plots/UniqueWords-COMvsNAT.png", width = 16, height = 12, units = "cm")
# 
# 
# # Rel freq diffs
# typefreq2 <- typefreq %>% filter(!uniqn & !uniqc) %>% # Exclude words that are unique for either group
#   arrange(desc(diff))
# 
# strongdiff <- rbind(head(typefreq2, 30),      # Top 30 with highest diff (Com overweight)
#                     tail(typefreq2, 30)) %>%  # Top 30 with lowest diff (Nat overweight)
#   arrange(desc(diff)) %>% 
#   mutate(run = rep(seq(1,30,1), 2))          # Sequence to place terms neatly onto the plot (no meaning)
# 
# ggplot(strongdiff, aes(x=diff, y = run))+ 
#   geom_vline(xintercept = 0, linetype = "dashed")+
#   geom_text(aes(label = word))+
#   labs(title = "Which words do nat. governments and the Commission use in different frequencies?",
#        subtitle = "The Top-30 with the strongest differences in frequency, respectively. Only words used by both groups considered here.",
#        x = "Difference in relative word frequency in press releases\nby the Commission and UK/IRE governments",
#        y= "")+
#   theme_bw()+
#   theme(axis.text.y = element_blank(),
#         panel.grid.major.y = element_blank(),
#         panel.grid.minor.y = element_blank(),
#         axis.ticks.y = element_blank())
# 
# ggsave("./Plots/DivisiveWords-COMvsNAT.png", width = 22, height = 12, units = "cm")




# Topic model ####
# See: https://raw.githubusercontent.com/bstewart/stm/master/vignettes/stmVignette.pdf

set.seed(20210201)

raw$commission <- ifelse(raw$sender == "Commission", 1, 0)
table(raw$commission)

# Pre-processing text data for stm 
# for efficiency reasons, I do this with quanteda
qcorp <- corpus(raw$text, docvars = raw[ ,c("cid", "sender", "commission")])

# DFM: stemmed, lowered, no stopwords
# only words that appear in at least 5% and max 95% of docs
qdfm <- dfm(qcorp, tolower = T,
            remove = stopwords("english"),
            remove_punct = T, 
            remove_numbers = T,
            remove_symbols = T,
            verbose = T) %>% 
  dfm_trim(min_docfreq = 0.01, max_docfreq = 0.99, docfreq_type = "prop")

# Convert to stm typed object
sdfm <- convert(qdfm, to = "stm")


# Fit model

# # With content co-variate
# topicfit <- stm(documents = sdfm$documents, vocab = sdfm$vocab,
#                 K = 20, content = sdfm$meta$commission,
#                 max.em.its = 75, data = sdfm$meta,
#                 init.type = "Spectral", verbose = T)

# Export topicfit
# write_rds(topicfit, "./Data/STM_fit_OverlapCorpus.rds")

# Reload
topicfit <- read_rds("./Data/STM_fit_OverlapCorpus.rds")


# Assess and store topic content ####

# Overall topic distribution in corpus
# With six most probably words per topic
plot(topicfit, type = "summary", n = 6, xlim = c(0, .3)) # 

# Topic words and labels
tlabels <- labelTopics(topicfit)

# # Get exemplary text per topic
# # n top-rated theta for topic
# findThoughts(topicfit, # The stm model
#              texts = raw$text, # Source of the raw text
#              n = 1, # Number of texts displayed per topic
#              topic = 5) # Topic select (defaults to all, accepts c() vectors)
#              # where = sender =="Commission") # Topic select (defaults to all, accepts c() vectors)


# Topic disribution across documents
topdstr <- topicfit$theta %>% as.data.frame() # Topic proportions by document
toptop <- colnames(topdstr)[apply(topdstr,1,which.max)] # Topic with highest proportion per doc
raw$toptopic <- toptop # Indicate most likely topic for each text in the overlap sample



# Understandability by topic ####

# Get data on understandability indicators 
# by author and main topic of press releases
tu <- raw %>% mutate(verbal = n_verb/n_noun,
                     commission = as.logical(commission)) %>% 
  select(c(commission, toptopic, flesch, familiarity, verbal)) %>% 
  pivot_longer(cols = 3:5)

# Calculate topic means by actor type
nat.agg <- tu %>% filter(commission == 0) %>% 
  group_by(toptopic, name) %>% 
  summarise(natmean = mean(value))

com.agg <- tu %>% filter(commission == 1) %>% 
  group_by(toptopic, name) %>% 
  summarise(commean = mean(value))

# Get difference between actor types within topic
agg <- merge(nat.agg, com.agg, by = c("toptopic", "name")) %>% 
  mutate(diff = commean - natmean,
         direc = ifelse(diff < 0, "EC PRs with lower average\nthan national PRs", "EC PRs with higher average\nthan national PRs"))

# Construct topic labels with 6 most probable words
twords <- tlabels$topics %>% as.data.frame() %>% 
  mutate(tnr = row_number())
twords$tlabel <- paste(twords$V1, twords$V2, twords$V3, sep = ", ")
twords$tlabel <- paste0(twords$tlabel, ",\n", twords$V4, ", ", twords$V5, ", ", twords$V6)
twords$tlabel <- paste0("T", twords$tnr, ": ", twords$tlabel)

# ... and merge to understandability data
agg$tnr <- str_remove(agg$toptopic, "V")
agg <- merge(agg, twords[, c("tnr", "tlabel")], by = "tnr", all.x = T)

# Determine labels and order of indicators
agg$name <- agg$name %>% 
  recode("flesch" = "Easy to read?\nFlesch/Kincaid score",
         "familiarity" = "Familiar words?\nAverage Google Books frequency",
         "verbal" = "Focus on action?\nRatio of verbs to nouns") 

agg$name2 <- factor(agg$name, 
                       levels = c("Easy to read?\nFlesch/Kincaid score",
                    "Familiar words?\nAverage Google Books frequency",
                    "Focus on action?\nRatio of verbs to nouns"))

# Determine topic order
toporder <- agg %>%  select(tnr, tlabel) %>% unique() %>% arrange(desc(as.numeric(tnr)))
agg$tlabel <- agg$tlabel %>% factor(levels = toporder$tlabel)
rm(toporder)
  
# Plot difference in understandability indicators by topic (Figure 3)
# Color version
ggplot(data = agg, aes(y = tlabel, x = diff, fill = direc))+
  geom_rect(ymin =.5, ymax = 1.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
  geom_rect(ymin =2.5, ymax = 3.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
  geom_rect(ymin =4.5, ymax = 5.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
  geom_rect(ymin =6.5, ymax = 7.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
  geom_rect(ymin =8.5, ymax = 9.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
  geom_rect(ymin =10.5, ymax = 11.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
  geom_rect(ymin =12.5, ymax = 13.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
  geom_rect(ymin =14.5, ymax = 15.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
  geom_rect(ymin =16.5, ymax = 17.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
  geom_rect(ymin =18.5, ymax = 19.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
  geom_col(width = .5)+
  # geom_linerange(aes(xmin = Lower, xmax = Upper), position = position_dodge(width = .5))+
  geom_vline(xintercept = 0, linetype = "dotted")+
  labs(y = "Estimated main topic\nof press releases\n ",
       x = " \nAverage difference between press releases of the European Commission and nat. governments (UK/IRE)\non the respective indicator\n ",
       fill = "",
       caption = "Based on full texts of 103,443 press releases (Commission: 11,837; UK: 84,940; IRE: 6,666) published between January 2010 and January 2021.\nTopic classification based on a Structural Topic Model with a content covariate accounting for wording differences between Commission and national PRs.\nLabels show the six most probable words within each topic.")+
  scale_fill_manual(values = c("#003399", "#FFCC00"))+
  facet_grid(.~name2, scales = "free_x")+
  theme_bw()+
  theme(legend.position = "bottom",
        strip.text = element_text(face = "bold"),
        axis.text.y = element_text(hjust = 0),
        text = element_text(family = "serif"))

ggsave("./Plots/Figure3_color.png", width = 24, height = 20, units = "cm")


# Plot difference in understandability indicators by topic (Figure 3)
# Greyscale version
ggplot(data = agg, aes(y = tlabel, x = diff, fill = direc))+
  geom_rect(ymin =.5, ymax = 1.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
  geom_rect(ymin =2.5, ymax = 3.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
  geom_rect(ymin =4.5, ymax = 5.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
  geom_rect(ymin =6.5, ymax = 7.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
  geom_rect(ymin =8.5, ymax = 9.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
  geom_rect(ymin =10.5, ymax = 11.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
  geom_rect(ymin =12.5, ymax = 13.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
  geom_rect(ymin =14.5, ymax = 15.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
  geom_rect(ymin =16.5, ymax = 17.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
  geom_rect(ymin =18.5, ymax = 19.5, xmin = -Inf, xmax = Inf, fill ="gray50", alpha = 0.01)+
  geom_col(width = .5)+
  # geom_linerange(aes(xmin = Lower, xmax = Upper), position = position_dodge(width = .5))+
  geom_vline(xintercept = 0, linetype = "dotted")+
  labs(y = "Estimated main topic\nof press releases\n ",
       x = " \nAverage difference between press releases of the European Commission and nat. governments (UK/IRE)\non the respective indicator\n ",
       fill = "",
       caption = "Based on full texts of 103,443 press releases (Commission: 11,837; UK: 84,940; IRE: 6,666) published between January 2010 and January 2021.\nTopic classification based on a Structural Topic Model with a content covariate accounting for wording differences between Commission and national PRs.\nLabels show the six most probable words within each topic.")+
  scale_fill_manual(values = c("black", "grey50"))+
  facet_grid(.~name2, scales = "free_x")+
  theme_bw()+
  theme(legend.position = "bottom",
        strip.text = element_text(face = "bold"),
        axis.text.y = element_text(hjust = 0),
        text = element_text(family = "serif"))

ggsave("./Plots/Figure3_greyscale.png", width = 24, height = 20, units = "cm")




# CEM matching ####

refitted <- refit(topicfit, sdfm$documents, content_level = "1")
projection <- project(topicfit, sdfm$documents)
matched <- cem_match(refitted,projection=projection, sdfm$meta$commission,
                     projection_breaks=2)
matched

# Get indices of matched texts
sample.indices <- matched$matched %>% 
  as.data.frame() %>% 
  rename(matched = 1) %>% 
  mutate(index = row_number()) %>% 
  filter(matched == T)

# Get matched sample data
msample <- raw[c(sample.indices$index), ]

# Compare understandability indicators
# in matched sample

df <- msample %>% 
  mutate(verbal = n_verb/n_noun,
         author = ifelse(commission ==1, "European Commission", "Nat. government (UK/IRE)")) %>% 
  select(author, flesch, familiarity, verbal) %>% 
  pivot_longer(cols = 2:4)

df$name <- df$name %>% 
  recode("flesch" = "Easy to read?\nFlesch/Kincaid score",
         "familiarity" = "Familiar words?\nAverage Google Books frequency",
         "verbal" = "Focus on action?\nRatio of verbs to nouns") 

df$name2 <- factor(df$name, 
                    levels = c("Easy to read?\nFlesch/Kincaid score",
                               "Familiar words?\nAverage Google Books frequency",
                               "Focus on action?\nRatio of verbs to nouns"))

ggplot(df, aes(x = value, y = author, colour = author))+
  stat_summary(geom = "linerange", fun.data = "mean_cl_boot", size = 1.1) +
  stat_summary(geom = "point", fun = "mean", size = 2.5) +
  facet_wrap(.~name2, scales = "free_x", ncol = 1)+
  scale_color_manual(values = c("#003399", "#FFCC00"))+
  labs(title = "Comparing language of Commission and national press releases\nin a sample matched on topic distributions in full texts\n ",
       x = "Mean value of respective indicator and 95% c.i.s",
       y = "",
       color = "Press release author: ",
       caption = "Coarsened exact match on topic distributions from a k=20 STM with a content covariate accounting for wording differences between Commission and national press releases.\n82,132 press releases (Commission: 11,321; Nat. gov.: 70,811) published between January 2010 and January 2021.")+
  theme_bw()+
  theme(legend.position = "bottom",
        axis.text.y = element_blank(),
        strip.text = element_text(face = "bold"),
        text = element_text(family = "serif"),
        plot.title = element_text(hjust = .5, face = "bold"))

ggsave("./Plots/FigureA4_TopicMatchedComparision.png", width = 24, height = 15, units = "cm")

